Principal Feature Analysis

In some applications it might be desired to pick a subset of the original features rather then find a mapping that uses all of the original features. The benefits of finding this subset of features could be in saving cost of computing unnecessary features, saving cost of sensors (in physical measurement systems), and in excluding noisy features while keeping their information using “clean” features (for example tracking points on a face using easy to track points- and inferring the other points based on those few measurements).

How does the algorithm look like:


class PFA(object):
    def __init__(self, n_features, q=None):
        self.q = q
        self.n_features = n_features

    def fit(self, X):
        if not self.q:
            self.q = X.shape[1]

        sc = StandardScaler()
        X = sc.fit_transform(X)

        pca = PCA(n_components=self.q).fit(X)
        A_q = pca.components_.T

        kmeans = KMeans(n_clusters=self.n_features).fit(A_q)
        clusters = kmeans.predict(A_q)
        cluster_centers = kmeans.cluster_centers_

        dists = defaultdict(list)
        for i, c in enumerate(clusters):
            dist = euclidean_distances([A_q[i, :]], [cluster_centers[c, :]])[0][0]
            dists[c].append((i, dist))

        self.indices_ = [sorted(f, key=lambda x: x[1])[0][0] for f in dists.values()]
        self.features_ = X[:, self.indices_]

Results of PFA

fig <- plot_ly(x = feature_list$X1, 
               y = feature_list$X0, 
               type = 'bar', 
               orientation = 'h'
               ) %>% 
  layout(yaxis = list(categoryorder = "total ascending"))

fig
##  [1] "dissChannels.platf."       "dissChannels.prof."       
##  [3] "dissChannels.mono."        "concepts.review."         
##  [5] "adoptByPolicyHow.SQ002."   "dissChannels.peer."       
##  [7] "dissChannels.policy."      "concepts.pub."            
##  [9] "natureOfInvolvement.busi." "dissChannels.web."        
## [11] "concepts3"                 "dissChannels.conf."       
## [13] "adoptByPolicy.rate."       "adoptByPolicyHow.SQ003."  
## [15] "dissChannels.trad."        "dissChannels.socmed."     
## [17] "dissChannels.consult."     "dissChannels.events."     
## [19] "dissChannels.public."      "concepts.data."           
## [21] "concepts.code."            "concepts.infra."          
## [23] "contribToSI.rate."         "groupsInvolved.res."      
## [25] "natureOfInvolvement.res."  "contribToSI.rate."

Factor Analysis

## Loading required package: lattice
## 
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
## 
##     parallel

# --- FA explanatory
# Maximum Likelihood Factor Analysis
# entering raw data and extracting 3 factors, 
# with varimax rotation 
fit <- factanal(df_red, 8, rotation="varimax")
print(fit, digits=2, cutoff=.3, sort=TRUE)
## 
## Call:
## factanal(x = df_red, factors = 8, rotation = "varimax")
## 
## Uniquenesses:
##      transdisciplinaryExp.rate.        familiarWithSI.response. 
##                            0.76                            0.66 
##               motivation.pheno.                motivation.prob. 
##                            0.92                            0.91 
##             motivation.welfare.            benefitForNonAcademy 
##                            0.41                            0.30 
##          impulseForNonAcad.soc.         impulseForNonAcad.econ. 
##                            0.54                            0.76 
##         impulseForNonAcad.ecol.       impulseForNonAcad.health. 
##                            0.84                            0.69 
##         impulseForNonAcad.tech.            groupsInvolved.busi. 
##                            0.92                            0.69 
##          groupsInvolved.civsoc.          groupsInvolved.policy. 
##                            0.19                            0.33 
##           groupsInvolved.citiz.           groupsInvolved.media. 
##                            0.52                            0.82 
##         groupsInvolved.welfare.     natureOfInvolvement.civsoc. 
##                            0.35                            0.37 
##     natureOfInvolvement.policy.      natureOfInvolvement.citiz. 
##                            0.61                            0.63 
##      natureOfInvolvement.media.    natureOfInvolvement.welfare. 
##                            0.92                            0.44 
##     targetGroupsGoals.socneeds.    targetGroupsGoals.socgroups. 
##                            0.52                            0.55 
##      targetGroupsGoals.improve.      targetGroupsGoals.empower. 
##                            0.38                            0.49 
##    targetGroupsGoals.diversity.                       concepts2 
##                            0.63                            0.65 
##          impactTargetGroup.pub.         impactTargetGroup.busi. 
##                            0.38                            0.18 
##        impactTargetGroup.socgr.      impactTargetGroup.welfare. 
##                            0.43                            0.38 
##       impactTargetGroup.civsoc.       impactTargetGroup.policy. 
##                            0.45                            0.24 
##         impactTargetGroup.acad.               kindOfChange.pub. 
##                            0.80                            0.48 
##              kindOfChange.busi.             kindOfChange.socgr. 
##                            0.33                            0.52 
##           kindOfChange.welfare.            kindOfChange.civsoc. 
##                            0.36                            0.61 
##            kindOfChange.policy.              kindOfChange.acad. 
##                            0.33                            0.71 
##         adoptByPolicyHow.SQ001.         Impactstatements.capab. 
##                            0.79                            0.43 
##         Impactstatements.emanc. Impactstatements.understanding. 
##                            0.42                            0.38 
##         Impactstatements.mitig.       Impactstatements.unknown. 
##                            0.44                            0.60 
##   Impactstatements.unaddressed.           scalabilityRating.up. 
##                            0.77                            0.42 
##          scalabilityRating.out.         scalabilityRating.deep. 
##                            0.41                            0.33 
## 
## Loadings:
##                                 Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## impulseForNonAcad.soc.           0.57                                          
## groupsInvolved.citiz.            0.51                            0.34          
## targetGroupsGoals.socneeds.      0.53    0.34                                  
## targetGroupsGoals.socgroups.     0.57                                          
## impactTargetGroup.socgr.         0.57                                          
## Impactstatements.capab.          0.52                                    0.33  
## Impactstatements.emanc.          0.64                                          
## Impactstatements.understanding.  0.71                                          
## Impactstatements.mitig.          0.59                                          
## scalabilityRating.deep.          0.51                                          
## motivation.welfare.                      0.67                                  
## benefitForNonAcademy                     0.71                                  
## targetGroupsGoals.improve.               0.72                                  
## groupsInvolved.policy.                           0.70                          
## impactTargetGroup.policy.                        0.66                          
## kindOfChange.policy.                             0.56                          
## groupsInvolved.welfare.          0.30                    0.69                  
## natureOfInvolvement.welfare.     0.33                    0.63                  
## impactTargetGroup.welfare.                               0.60                  
## kindOfChange.welfare.                                    0.66                  
## groupsInvolved.civsoc.           0.36                            0.77          
## natureOfInvolvement.civsoc.                                      0.76          
## impactTargetGroup.civsoc.        0.34                            0.53          
## impactTargetGroup.busi.                                                  0.84  
## kindOfChange.busi.                                                       0.76  
## scalabilityRating.up.                    0.35                                  
## scalabilityRating.out.           0.30                                          
## kindOfChange.pub.                                                              
## transdisciplinaryExp.rate.       0.33                                          
## familiarWithSI.response.         0.43                                          
## motivation.pheno.                                                              
## motivation.prob.                                                               
## impulseForNonAcad.econ.                          0.33                          
## impulseForNonAcad.ecol.                                                        
## impulseForNonAcad.health.                0.47                                  
## impulseForNonAcad.tech.                                                        
## groupsInvolved.busi.                                                     0.50  
## groupsInvolved.media.                                                          
## natureOfInvolvement.policy.                      0.49                          
## natureOfInvolvement.citiz.       0.43                            0.36          
## natureOfInvolvement.media.                                                     
## targetGroupsGoals.empower.       0.45                    0.30                  
## targetGroupsGoals.diversity.     0.43                                          
## concepts2                        0.49                                          
## impactTargetGroup.pub.                   0.44                            0.35  
## impactTargetGroup.acad.                                                        
## kindOfChange.socgr.              0.48                                          
## kindOfChange.civsoc.                                             0.45          
## kindOfChange.acad.                                                             
## adoptByPolicyHow.SQ001.                                                        
## Impactstatements.unknown.        0.50                                          
## Impactstatements.unaddressed.    0.33                                          
##                                 Factor7 Factor8
## impulseForNonAcad.soc.                         
## groupsInvolved.citiz.                          
## targetGroupsGoals.socneeds.                    
## targetGroupsGoals.socgroups.                   
## impactTargetGroup.socgr.                       
## Impactstatements.capab.                        
## Impactstatements.emanc.                        
## Impactstatements.understanding.                
## Impactstatements.mitig.                        
## scalabilityRating.deep.          0.50          
## motivation.welfare.                            
## benefitForNonAcademy                           
## targetGroupsGoals.improve.                     
## groupsInvolved.policy.                         
## impactTargetGroup.policy.        0.36          
## kindOfChange.policy.                     0.47  
## groupsInvolved.welfare.                        
## natureOfInvolvement.welfare.                   
## impactTargetGroup.welfare.       0.34          
## kindOfChange.welfare.                    0.32  
## groupsInvolved.civsoc.                         
## natureOfInvolvement.civsoc.                    
## impactTargetGroup.civsoc.                      
## impactTargetGroup.busi.                        
## kindOfChange.busi.                             
## scalabilityRating.up.            0.54          
## scalabilityRating.out.           0.53          
## kindOfChange.pub.                        0.62  
## transdisciplinaryExp.rate.                     
## familiarWithSI.response.                       
## motivation.pheno.                              
## motivation.prob.                               
## impulseForNonAcad.econ.                        
## impulseForNonAcad.ecol.                        
## impulseForNonAcad.health.                      
## impulseForNonAcad.tech.                        
## groupsInvolved.busi.                           
## groupsInvolved.media.                          
## natureOfInvolvement.policy.                    
## natureOfInvolvement.citiz.                     
## natureOfInvolvement.media.                     
## targetGroupsGoals.empower.                     
## targetGroupsGoals.diversity.                   
## concepts2                                      
## impactTargetGroup.pub.           0.41          
## impactTargetGroup.acad.          0.35          
## kindOfChange.socgr.                      0.42  
## kindOfChange.civsoc.                           
## kindOfChange.acad.                       0.43  
## adoptByPolicyHow.SQ001.                        
## Impactstatements.unknown.                      
## Impactstatements.unaddressed.                  
## 
##                Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7 Factor8
## SS loadings       6.56    2.96    2.75    2.72    2.64    2.41    2.23    1.74
## Proportion Var    0.13    0.06    0.05    0.05    0.05    0.05    0.04    0.03
## Cumulative Var    0.13    0.18    0.24    0.29    0.34    0.39    0.43    0.46
## 
## Test of the hypothesis that 8 factors are sufficient.
## The chi square statistic is 2419.95 on 938 degrees of freedom.
## The p-value is 2.03e-131
# plot factor 1 by factor 2 
#load <- fit$loadings[,1:2] 
#plot(load,type="n") # set up plot 
#text(load,labels=names(df_red),cex=.7) # add variable names


#fit$loadings

2 Confirmatory Factor Analysis Model

1. Theory driven model

model_theory <-"
  ## SI Familiarity
  fam =~ familiarWithSI.response.+transdisciplinaryExp.rate.
  
  ## intention_agency
  ia_human_condition =~ motivation.welfare.+benefitForNonAcademy+impulseForNonAcad.soc.+targetGroupsGoals.improve.+impulseForNonAcad.health.+impulseForNonAcad.ecol.
  
  ia_non_academic =~ impulseForNonAcad.econ.+impulseForNonAcad.tech.
  
  ## transdisciplinary_apects
  transdisciplinary_social =~ groupsInvolved.citiz.+groupsInvolved.civsoc.+groupsInvolved.welfare.+natureOfInvolvement.citiz.+natureOfInvolvement.civsoc.+natureOfInvolvement.welfare.+targetGroupsGoals.socneeds.+targetGroupsGoals.socgroups.+targetGroupsGoals.empower.+targetGroupsGoals.diversity.
  
  ## outcome
  outcome_public =~ impactTargetGroup.pub.+impactTargetGroup.socgr.+impactTargetGroup.welfare.+impactTargetGroup.civsoc.+kindOfChange.pub.+kindOfChange.socgr.+kindOfChange.welfare.+kindOfChange.civsoc.
  
  outcome_statement =~ Impactstatements.capab.+Impactstatements.emanc.+Impactstatements.understanding.+Impactstatements.mitig.+Impactstatements.unknown.+Impactstatements.unaddressed.
  
  ## MISC:scalability
  scale =~ scalabilityRating.up.+scalabilityRating.out.+scalabilityRating.deep.
  
  ## MISC:policy
  policy =~ groupsInvolved.policy.+impactTargetGroup.policy.+kindOfChange.policy.+natureOfInvolvement.policy.+groupsInvolved.policy.+adoptByPolicyHow.SQ001. 
  
  ## MISC:business
  busi =~ groupsInvolved.busi.+impactTargetGroup.busi.+kindOfChange.busi.
"

2 EFA driven model

model_efa <- "
  f1 =~ impulseForNonAcad.soc.+groupsInvolved.citiz.+targetGroupsGoals.socneeds.+targetGroupsGoals.socgroups.+impactTargetGroup.socgr.+Impactstatements.capab.+Impactstatements.emanc.+Impactstatements.understanding.+Impactstatements.mitig.+transdisciplinaryExp.rate.+familiarWithSI.response. 
  f2 =~ motivation.welfare.+benefitForNonAcademy+targetGroupsGoals.improve.+impulseForNonAcad.health.+impactTargetGroup.pub. 
  f3 =~ groupsInvolved.policy.+impactTargetGroup.policy.+kindOfChange.policy.+natureOfInvolvement.policy.+impulseForNonAcad.econ.+natureOfInvolvement.policy.
  f4 =~ groupsInvolved.welfare.+natureOfInvolvement.welfare.+impactTargetGroup.welfare.+kindOfChange.welfare. 
  f5 =~ groupsInvolved.civsoc.+natureOfInvolvement.civsoc.+impactTargetGroup.civsoc.+kindOfChange.civsoc.
  f6 =~ impactTargetGroup.busi.+kindOfChange.busi.+groupsInvolved.busi.
  f7 =~ scalabilityRating.up.+scalabilityRating.out.+scalabilityRating.deep.
  f8 =~ kindOfChange.pub.+kindOfChange.socgr.+kindOfChange.acad.
"
## Warning in lav_object_post_check(object): lavaan WARNING: some estimated lv
## variances are negative

Optimization opportunities?

modificationindices(fit_theory) %>% arrange(-mi) %>% head(10)
##                          lhs op                          rhs      mi    epc
## 1             outcome_public =~   transdisciplinaryExp.rate. 248.724  4.705
## 2          outcome_statement =~   transdisciplinaryExp.rate. 229.872  3.208
## 3   transdisciplinary_social =~   transdisciplinaryExp.rate. 195.396 15.619
## 4   familiarWithSI.response. ~~         benefitForNonAcademy 174.404 -1.477
## 5                      scale =~   transdisciplinaryExp.rate. 146.331  2.080
## 6     groupsInvolved.civsoc. ~~  natureOfInvolvement.civsoc. 144.541  0.098
## 7    groupsInvolved.welfare. ~~ natureOfInvolvement.welfare. 131.410  0.151
## 8   familiarWithSI.response. ~~      impactTargetGroup.busi. 121.142 -6.324
## 9  impactTargetGroup.civsoc. ~~         kindOfChange.civsoc. 116.426  0.556
## 10        ia_human_condition =~   transdisciplinaryExp.rate. 112.005  2.508
##    sepc.lv sepc.all sepc.nox
## 1    8.837    2.895    2.895
## 2    7.442    2.438    2.438
## 3    6.745    2.209    2.209
## 4   -1.477   -1.772   -1.772
## 5    6.067    1.987    1.987
## 6    0.098    0.663    0.663
## 7    0.151    0.645    0.645
## 8   -6.324   -2.907   -2.907
## 9    0.556    0.607    0.607
## 10   6.439    2.109    2.109
modificationindices(fit_efa) %>% arrange(-mi) %>% head(10)
##                           lhs op                          rhs      mi   epc
## 1      groupsInvolved.civsoc. ~~  natureOfInvolvement.civsoc. 114.737 0.076
## 2     groupsInvolved.welfare. ~~ natureOfInvolvement.welfare.  85.547 0.126
## 3      groupsInvolved.policy. ~~  natureOfInvolvement.policy.  78.328 0.159
## 4    impactTargetGroup.socgr. ~~          kindOfChange.socgr.  71.901 0.514
## 5   impactTargetGroup.civsoc. ~~         kindOfChange.civsoc.  66.679 0.385
## 6                          f1 =~      scalabilityRating.deep.  65.994 7.957
## 7                          f6 =~       impactTargetGroup.pub.  64.042 0.470
## 8                          f6 =~      Impactstatements.capab.  58.973 0.415
## 9  impactTargetGroup.welfare. ~~        kindOfChange.welfare.  55.633 0.554
## 10                         f7 =~       impactTargetGroup.pub.  49.782 0.503
##    sepc.lv sepc.all sepc.nox
## 1    0.076    0.739    0.739
## 2    0.126    0.911    0.911
## 3    0.159    0.522    0.522
## 4    0.514    0.521    0.521
## 5    0.385    0.690    0.690
## 6    1.893    0.610    0.610
## 7    1.223    0.393    0.393
## 8    1.078    0.342    0.342
## 9    0.554    0.496    0.496
## 10   1.486    0.477    0.477

Which model is better?

anova(fit_theory, fit_efa)
## Warning in lavTestLRT(object = object, ..., model.names = NAMES): lavaan
## WARNING: some models are based on a different set of observed variables
## Chi-Squared Difference Test
## 
##             Df   AIC   BIC  Chisq Chisq diff Df diff Pr(>Chisq)    
## fit_efa    637 37684 38089 2534.8                                  
## fit_theory 909 42713 43203 3688.2     1153.4     272  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The comparison between Models and (self assesment) SI rate

DF.refined <- data.frame(predict(fit_theory))
## Warning in lav_object_post_check(object): lavaan WARNING: some estimated lv
## variances are negative
a <- scales::rescale(apply(DF.refined, mean, MARGIN = 1), to = c(0, 10))
b <- feat_df.num_o$contribToSI.rate.
c <- 1:length(a)

ab <- as.data.frame(cbind(a,b,c))
library(ggplot2)
ggplot(ab, aes(x = c)) + 
  geom_line(aes(y = a), color = "darkred") + 
  geom_line(aes(y = b), color="steelblue") 

DF.refined2 <- data.frame(predict(fit_efa))

d <- scales::rescale(apply(DF.refined2, mean, MARGIN = 1), to = c(0, 10))

db <- as.data.frame(cbind(d,b,c))
ggplot(db, aes(x = c)) + 
  geom_line(aes(y = d), color = "darkred") + 
  geom_line(aes(y = b), color="steelblue") 

var_prop_efa <- c(0.13,  0.06,    0.05,    0.05,    0.05,    0.05,    0.04,    0.03)
new_DF_refined2 <- as.data.frame( as.matrix(DF.refined2) %*% diag(var_prop_efa))

f <- scales::rescale(apply(new_DF_refined2, mean, MARGIN = 1), to = c(0, 10))
data <- data.frame(a,b,c,d,f)
fig <- plot_ly(data, x = ~c, y = ~a, name = 'theory', type = 'bar') 
fig <- fig %>% add_trace(y = ~b, name = 'contribToSI.rate', type = 'bar') 
fig <- fig %>% add_trace(y = ~d, name = 'efa', type = 'bar')
fig <- fig %>% add_trace(y = ~f, name = 'efa_scaled', type = 'bar')

fig
  • Neue Varible: log(pfa)
feature_list_red <- feature_list[!(feature_list$X0 %in% features_to_rm),]
feature_list_red$logX <-  log(feature_list_red$X1)
df_red2 <- df_red
for (i in seq_along(feature_list_red$logX)) {
  df_red2[, i] <- df_red2[, i] * feature_list_red$logX[i]
}

g <- scales::rescale(apply(df_red2, mean, MARGIN = 1), to = c(0, 10))
data$g <- g


library(plotly) 
fig1 <- plot_ly(x = data$c, y = data$a, type = 'scatter', mode = 'lines', name="theory") 
fig2 <- plot_ly(x = data$c, y = data$b, type = 'scatter', mode = 'lines', name="contribToSI") 
fig3 <- plot_ly(x = data$c, y = data$d, type = 'scatter', mode = 'lines', name="efa") 
fig4 <- plot_ly(x = data$c, y = data$f, type = 'scatter', mode = 'lines', name="efa_scaled") 
fig5 <- plot_ly(x = data$c, y = data$g, type = 'scatter', mode = 'lines', name="log(pfa)") 
fig <- subplot(fig1, fig2, fig3, fig4, fig5, nrows = 5) %>% 
  layout(title = list(text = ""),
         plot_bgcolor='#e5ecf6', 
         xaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'), 
         yaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff')) 
fig
mean(aggregate( a ~ b , data = data , sum , na.rm = TRUE )$a)
## [1] 85.35234
mean(aggregate( d ~ b , data = data , sum , na.rm = TRUE )$d)
## [1] 85.77354
mean(aggregate( f ~ b , data = data , sum , na.rm = TRUE )$f)
## [1] 87.05012
mean(aggregate( g ~ b , data = data , sum , na.rm = TRUE )$g)
## [1] 112.6411
mean(a-b)
## [1] 0.8943928
mean(d-b)
## [1] 0.9072271
mean(f-b)
## [1] 0.9461254
mean(g-b)
## [1] 1.725907